-
Notifications
You must be signed in to change notification settings - Fork 15.1k
[llvm-exegesis] [AArch64] Add support for Load Instructions in subprocess execution mode #144895
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
[llvm-exegesis] [AArch64] Add support for Load Instructions in subprocess execution mode #144895
Conversation
|
@llvm/pr-subscribers-tools-llvm-exegesis Author: Lakshay Kumar (lakshayk-nv) ChangesWe want to support load and store instructions for aarch64 but currently they throw segmentation fault as register are initialized by 0 but load instruction requires register to be loaded with valid memory address. Load registers requiring memory address
Adding memory setup, required by subprocess for AArch64. TODO: how to differentiate between register requiring address and otherwise ? Enable load instructions in --mode=subprocess
Please review: @sjoerdmeijer, @boomanaiden154, @davemgreen Patch is 20.17 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/144895.diff 4 Files Affected:
diff --git a/llvm/test/tools/llvm-exegesis/AArch64/setReg_init_check.s b/llvm/test/tools/llvm-exegesis/AArch64/setReg_init_check.s
index a4350fc6dc2cb..d0dc5c744ab80 100644
--- a/llvm/test/tools/llvm-exegesis/AArch64/setReg_init_check.s
+++ b/llvm/test/tools/llvm-exegesis/AArch64/setReg_init_check.s
@@ -70,6 +70,6 @@ RUN: llvm-objdump -d %d > %t.s
RUN: FileCheck %s --check-prefix=FPCR-ASM < %t.s
FPCR-ASM: <foo>:
FPCR-ASM: movi d{{[0-9]+}}, #0000000000000000
-FPCR-ASM-NEXT: mov x8, #0x0
-FPCR-ASM-NEXT: msr FPCR, x8
+FPCR-ASM-NEXT: mov x16, #0x0
+FPCR-ASM-NEXT: msr FPCR, x16
FPCR-ASM-NEXT: bfcvt h{{[0-9]+}}, s{{[0-9]+}}
diff --git a/llvm/tools/llvm-exegesis/lib/AArch64/Target.cpp b/llvm/tools/llvm-exegesis/lib/AArch64/Target.cpp
index a1eb5a46f21fc..df9fbf6946005 100644
--- a/llvm/tools/llvm-exegesis/lib/AArch64/Target.cpp
+++ b/llvm/tools/llvm-exegesis/lib/AArch64/Target.cpp
@@ -6,18 +6,28 @@
//
//===----------------------------------------------------------------------===//
#include "../Target.h"
+#include "../Error.h"
+#include "../MmapUtils.h"
+#include "../SerialSnippetGenerator.h"
+#include "../SnippetGenerator.h"
+#include "../SubprocessMemory.h"
#include "AArch64.h"
#include "AArch64RegisterInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include <vector>
+#define DEBUG_TYPE "exegesis-aarch64-target"
#if defined(__aarch64__) && defined(__linux__)
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <unistd.h> // for getpagesize()
+#ifdef HAVE_LIBPFM
+#include <perfmon/perf_event.h>
+#endif // HAVE_LIBPFM
#include <linux/prctl.h> // For PR_PAC_* constants
#include <sys/prctl.h>
-#ifndef PR_PAC_SET_ENABLED_KEYS
-#define PR_PAC_SET_ENABLED_KEYS 60
-#endif
-#ifndef PR_PAC_GET_ENABLED_KEYS
-#define PR_PAC_GET_ENABLED_KEYS 61
-#endif
#ifndef PR_PAC_APIAKEY
#define PR_PAC_APIAKEY (1UL << 0)
#endif
@@ -38,47 +48,6 @@
namespace llvm {
namespace exegesis {
-bool isPointerAuth(unsigned Opcode) {
- switch (Opcode) {
- default:
- return false;
-
- // FIXME: Pointer Authentication instructions.
- // We would like to measure these instructions, but they can behave
- // differently on different platforms, and maybe the snippets need to look
- // different for these instructions,
- // Platform-specific handling: On Linux, we disable authentication, may
- // interfere with measurements. On non-Linux platforms, disable opcodes for
- // now.
- case AArch64::AUTDA:
- case AArch64::AUTDB:
- case AArch64::AUTDZA:
- case AArch64::AUTDZB:
- case AArch64::AUTIA:
- case AArch64::AUTIA1716:
- case AArch64::AUTIASP:
- case AArch64::AUTIAZ:
- case AArch64::AUTIB:
- case AArch64::AUTIB1716:
- case AArch64::AUTIBSP:
- case AArch64::AUTIBZ:
- case AArch64::AUTIZA:
- case AArch64::AUTIZB:
- return true;
- }
-}
-
-bool isLoadTagMultiple(unsigned Opcode) {
- switch (Opcode) {
- default:
- return false;
-
- // Load tag multiple instruction
- case AArch64::LDGM:
- return true;
- }
-}
-
static unsigned getLoadImmediateOpcode(unsigned RegBitWidth) {
switch (RegBitWidth) {
case 32:
@@ -120,7 +89,7 @@ static MCInst loadPPRImmediate(MCRegister Reg, unsigned RegBitWidth,
// Generates instructions to load an immediate value into an FPCR register.
static std::vector<MCInst>
loadFPCRImmediate(MCRegister Reg, unsigned RegBitWidth, const APInt &Value) {
- MCRegister TempReg = AArch64::X8;
+ MCRegister TempReg = AArch64::X16;
MCInst LoadImm = MCInstBuilder(AArch64::MOVi64imm).addReg(TempReg).addImm(0);
MCInst MoveToFPCR =
MCInstBuilder(AArch64::MSR).addImm(AArch64SysReg::FPCR).addReg(TempReg);
@@ -153,6 +122,89 @@ static MCInst loadFPImmediate(MCRegister Reg, unsigned RegBitWidth,
return Instructions;
}
+static void generateRegisterStackPush(unsigned int RegToPush,
+ std::vector<MCInst> &GeneratedCode,
+ int imm = -16) {
+ // STR [X|W]t, [SP, #simm]!: SP is decremented by default 16 bytes
+ // before the store to maintain 16-bytes alignment.
+ if (AArch64::GPR64RegClass.contains(RegToPush)) {
+ GeneratedCode.push_back(MCInstBuilder(AArch64::STRXpre)
+ .addReg(AArch64::SP)
+ .addReg(RegToPush)
+ .addReg(AArch64::SP)
+ .addImm(imm));
+ } else if (AArch64::GPR32RegClass.contains(RegToPush)) {
+ GeneratedCode.push_back(MCInstBuilder(AArch64::STRWpre)
+ .addReg(AArch64::SP)
+ .addReg(RegToPush)
+ .addReg(AArch64::SP)
+ .addImm(imm));
+ } else {
+ llvm_unreachable("Unsupported register class for stack push");
+ }
+}
+
+static void generateRegisterStackPop(unsigned int RegToPopTo,
+ std::vector<MCInst> &GeneratedCode,
+ int imm = 16) {
+ // LDR Xt, [SP], #simm: SP is incremented by default 16 bytes after the load.
+ if (AArch64::GPR64RegClass.contains(RegToPopTo)) {
+ GeneratedCode.push_back(MCInstBuilder(AArch64::LDRXpost)
+ .addReg(AArch64::SP)
+ .addReg(RegToPopTo)
+ .addReg(AArch64::SP)
+ .addImm(imm));
+ } else if (AArch64::GPR32RegClass.contains(RegToPopTo)) {
+ GeneratedCode.push_back(MCInstBuilder(AArch64::LDRWpost)
+ .addReg(AArch64::SP)
+ .addReg(RegToPopTo)
+ .addReg(AArch64::SP)
+ .addImm(imm));
+ } else {
+ llvm_unreachable("Unsupported register class for stack pop");
+ }
+}
+
+void generateSysCall(long SyscallNumber, std::vector<MCInst> &GeneratedCode) {
+ GeneratedCode.push_back(
+ loadImmediate(AArch64::X8, 64, APInt(64, SyscallNumber)));
+ GeneratedCode.push_back(MCInstBuilder(AArch64::SVC).addImm(0));
+}
+
+/// Functions to save/restore system call registers
+#ifdef __linux__
+constexpr std::array<unsigned, 6> SyscallArgumentRegisters{
+ AArch64::X0, AArch64::X1, AArch64::X2,
+ AArch64::X3, AArch64::X4, AArch64::X5,
+};
+
+static void saveSysCallRegisters(std::vector<MCInst> &GeneratedCode,
+ unsigned ArgumentCount) {
+ // AArch64 Linux typically uses X0-X5 for the first 6 arguments.
+ // Some syscalls can take up to 8 arguments in X0-X7.
+ assert(ArgumentCount <= 6 &&
+ "This implementation saves up to 6 argument registers (X0-X5)");
+ // generateRegisterStackPush(ArgumentRegisters::TempRegister, GeneratedCode);
+ // Preserve X8 (used for the syscall number/return value).
+ generateRegisterStackPush(AArch64::X8, GeneratedCode);
+ // Preserve the registers used to pass arguments to the system call.
+ for (unsigned I = 0; I < ArgumentCount; ++I) {
+ generateRegisterStackPush(SyscallArgumentRegisters[I], GeneratedCode);
+ }
+}
+
+static void restoreSysCallRegisters(std::vector<MCInst> &GeneratedCode,
+ unsigned ArgumentCount) {
+ assert(ArgumentCount <= 6 &&
+ "This implementation restores up to 6 argument registers (X0-X5)");
+ // Restore argument registers, in opposite order of the way they are saved.
+ for (int I = ArgumentCount - 1; I >= 0; --I) {
+ generateRegisterStackPop(SyscallArgumentRegisters[I], GeneratedCode);
+ }
+ generateRegisterStackPop(AArch64::X8, GeneratedCode);
+ // generateRegisterStackPop(ArgumentRegisters::TempRegister, GeneratedCode);
+}
+#endif // __linux__
#include "AArch64GenExegesis.inc"
namespace {
@@ -162,7 +214,39 @@ class ExegesisAArch64Target : public ExegesisTarget {
ExegesisAArch64Target()
: ExegesisTarget(AArch64CpuPfmCounters, AArch64_MC::isOpcodeAvailable) {}
+ enum ArgumentRegisters {
+ CodeSize = AArch64::X12,
+ AuxiliaryMemoryFD = AArch64::X13,
+ TempRegister = AArch64::X16,
+ };
+
+ std::vector<MCInst> _generateRegisterStackPop(MCRegister Reg,
+ int imm = 0) const override {
+ std::vector<MCInst> Insts;
+ if (AArch64::GPR32RegClass.contains(Reg) ||
+ AArch64::GPR64RegClass.contains(Reg)) {
+ generateRegisterStackPop(Reg, Insts, imm);
+ return Insts;
+ }
+ return {};
+ }
+
private:
+#ifdef __linux__
+ std::vector<MCInst> generateExitSyscall(unsigned ExitCode) const override;
+ std::vector<MCInst>
+ generateMmap(uintptr_t Address, size_t Length,
+ uintptr_t FileDescriptorAddress) const override;
+ void generateMmapAuxMem(std::vector<MCInst> &GeneratedCode) const override;
+ std::vector<MCInst> generateMemoryInitialSetup() const override;
+ std::vector<MCInst> setStackRegisterToAuxMem() const override;
+ uintptr_t getAuxiliaryMemoryStartAddress() const override;
+ std::vector<MCInst> configurePerfCounter(long Request,
+ bool SaveRegisters) const override;
+ std::vector<MCRegister> getArgumentRegisters() const override;
+ std::vector<MCRegister> getRegistersNeedSaving() const override;
+#endif // __linux__
+
std::vector<MCInst> setRegTo(const MCSubtargetInfo &STI, MCRegister Reg,
const APInt &Value) const override {
if (AArch64::GPR32RegClass.contains(Reg))
@@ -199,37 +283,157 @@ class ExegesisAArch64Target : public ExegesisTarget {
PM.add(createAArch64ExpandPseudoPass());
}
- const char *getIgnoredOpcodeReasonOrNull(const LLVMState &State,
- unsigned Opcode) const override {
- if (const char *Reason =
- ExegesisTarget::getIgnoredOpcodeReasonOrNull(State, Opcode))
- return Reason;
+} // namespace
- if (isPointerAuth(Opcode)) {
-#if defined(__aarch64__) && defined(__linux__)
- // Disable all PAC keys. Note that while we expect the measurements to
- // be the same with PAC keys disabled, they could potentially be lower
- // since authentication checks are bypassed.
- if (prctl(PR_PAC_SET_ENABLED_KEYS,
- PR_PAC_APIAKEY | PR_PAC_APIBKEY | PR_PAC_APDAKEY |
- PR_PAC_APDBKEY, // all keys
- 0, // disable all
- 0, 0) < 0) {
- return "Failed to disable PAC keys";
- }
-#else
- return "Unsupported opcode: isPointerAuth";
-#endif
- }
+#ifdef __linux__
+// true : let use of fixed address to Virtual Address Space Ceiling
+// false: let kernel choose the address of the auxiliary memory
+bool UseFixedAddress = true;
+
+static constexpr const uintptr_t VAddressSpaceCeiling = 0x0000800000000000;
- if (isLoadTagMultiple(Opcode))
- return "Unsupported opcode: load tag multiple";
+static void generateRoundToNearestPage(unsigned int TargetRegister,
+ std::vector<MCInst> &GeneratedCode) {
+ int PageSizeShift = static_cast<int>(round(log2(getpagesize())));
+ // Round down to the nearest page by getting rid of the least significant bits
+ // representing location in the page.
+
+ // Single instruction using AND with inverted mask (effectively BIC)
+ uint64_t BitsToClearMask = (1ULL << PageSizeShift) - 1; // 0xFFF
+ uint64_t AndMask = ~BitsToClearMask; // ...FFFFFFFFFFFF000
+ GeneratedCode.push_back(MCInstBuilder(AArch64::ANDXri)
+ .addReg(TargetRegister) // Xd
+ .addReg(TargetRegister) // Xn
+ .addImm(AndMask) // imm bitmask
+ );
+}
- return nullptr;
+std::vector<MCInst>
+ExegesisAArch64Target::generateExitSyscall(unsigned ExitCode) const {
+ std::vector<MCInst> ExitCallCode;
+ ExitCallCode.push_back(loadImmediate(AArch64::X0, 64, APInt(64, ExitCode)));
+ generateSysCall(SYS_exit, ExitCallCode); // SYS_exit is 93
+ return ExitCallCode;
+}
+
+std::vector<MCInst>
+ExegesisAArch64Target::generateMmap(uintptr_t Address, size_t Length,
+ uintptr_t FileDescriptorAddress) const {
+ // mmap(address, length, prot, flags, fd, offset=0)
+ int flags = MAP_SHARED;
+ if (Address != 0) {
+ flags |= MAP_FIXED_NOREPLACE;
}
-};
+ std::vector<MCInst> MmapCode;
+ MmapCode.push_back(
+ loadImmediate(AArch64::X0, 64, APInt(64, Address))); // map adr
+ MmapCode.push_back(
+ loadImmediate(AArch64::X1, 64, APInt(64, Length))); // length
+ MmapCode.push_back(loadImmediate(AArch64::X2, 64,
+ APInt(64, PROT_READ | PROT_WRITE))); // prot
+ MmapCode.push_back(loadImmediate(AArch64::X3, 64, APInt(64, flags))); // flags
+ // FIXME: File descriptor address is not initialized.
+ // Copy file descriptor location from aux memory into X4
+ MmapCode.push_back(
+ loadImmediate(AArch64::X4, 64, APInt(64, FileDescriptorAddress))); // fd
+ // Dereference file descriptor into FD argument register
+ // MmapCode.push_back(MCInstBuilder(AArch64::LDRWui)
+ // .addReg(AArch64::W4) // Destination register
+ // .addReg(AArch64::X4) // Base register (address)
+ // .addImm(0)); // Offset (-byte words)
+ // FIXME: This is not correct.
+ MmapCode.push_back(loadImmediate(AArch64::X5, 64, APInt(64, 0))); // offset
+ generateSysCall(SYS_mmap, MmapCode); // SYS_mmap is 222
+ return MmapCode;
+}
-} // namespace
+void ExegesisAArch64Target::generateMmapAuxMem(
+ std::vector<MCInst> &GeneratedCode) const {
+ int fd = -1;
+ int flags = MAP_SHARED;
+ uintptr_t address = getAuxiliaryMemoryStartAddress();
+ if (fd == -1)
+ flags |= MAP_ANONYMOUS;
+ if (address != 0)
+ flags |= MAP_FIXED_NOREPLACE;
+ int prot = PROT_READ | PROT_WRITE;
+
+ GeneratedCode.push_back(
+ loadImmediate(AArch64::X0, 64, APInt(64, address))); // map adr
+ GeneratedCode.push_back(loadImmediate(
+ AArch64::X1, 64,
+ APInt(64, SubprocessMemory::AuxiliaryMemorySize))); // length
+ GeneratedCode.push_back(
+ loadImmediate(AArch64::X2, 64, APInt(64, prot))); // prot
+ GeneratedCode.push_back(
+ loadImmediate(AArch64::X3, 64, APInt(64, flags))); // flags
+ GeneratedCode.push_back(loadImmediate(AArch64::X4, 64, APInt(64, fd))); // fd
+ GeneratedCode.push_back(
+ loadImmediate(AArch64::X5, 64, APInt(64, 0))); // offset
+ generateSysCall(SYS_mmap, GeneratedCode); // SYS_mmap is 222
+}
+
+std::vector<MCInst> ExegesisAArch64Target::generateMemoryInitialSetup() const {
+ std::vector<MCInst> MemoryInitialSetupCode;
+ generateMmapAuxMem(MemoryInitialSetupCode); // FIXME: Uninit file descriptor
+
+ // If using fixed address for auxiliary memory skip this step,
+ // When using dynamic memory allocation (non-fixed address), we must preserve
+ // the mmap return value (X0) which contains the allocated memory address.
+ // This value is saved to the stack to ensure registers requiring memory
+ // access can retrieve the correct address even if X0 is modified by
+ // intermediate code.
+ generateRegisterStackPush(AArch64::X0, MemoryInitialSetupCode);
+ // FIXME: Ensure stack pointer remains stable to prevent loss of saved address
+ return MemoryInitialSetupCode;
+}
+
+std::vector<MCInst> ExegesisAArch64Target::setStackRegisterToAuxMem() const {
+ std::vector<MCInst> instructions; // NOP
+ // TODO: Implement this, Found no need for this in AArch64.
+ return instructions;
+}
+
+uintptr_t ExegesisAArch64Target::getAuxiliaryMemoryStartAddress() const {
+ if (!UseFixedAddress)
+ // Allow kernel to select an appropriate memory address
+ return 0;
+ // Return the second to last page in the virtual address space
+ // to try and prevent interference with memory annotations in the snippet
+ // VAddressSpaceCeiling = 0x0000800000000000
+ // FIXME: Why 2 pages?
+ return VAddressSpaceCeiling - (2 * getpagesize());
+}
+
+std::vector<MCInst>
+ExegesisAArch64Target::configurePerfCounter(long Request,
+ bool SaveRegisters) const {
+ std::vector<MCInst> ConfigurePerfCounterCode; // NOP
+ // FIXME: SYSCALL exits with EBADF error - file descriptor is invalid
+ // No file is opened previosly to add as file descriptor
+ return ConfigurePerfCounterCode;
+}
+
+std::vector<MCRegister> ExegesisAArch64Target::getArgumentRegisters() const {
+ return {AArch64::X0, AArch64::X1};
+}
+
+std::vector<MCRegister> ExegesisAArch64Target::getRegistersNeedSaving() const {
+ return {
+ AArch64::X0,
+ AArch64::X1,
+ AArch64::X2,
+ AArch64::X3,
+ AArch64::X4,
+ AArch64::X5,
+ AArch64::X8,
+ ArgumentRegisters::TempRegister,
+ ArgumentRegisters::CodeSize,
+ ArgumentRegisters::AuxiliaryMemoryFD,
+ };
+}
+
+#endif // __linux__
static ExegesisTarget *getTheExegesisAArch64Target() {
static ExegesisAArch64Target Target;
diff --git a/llvm/tools/llvm-exegesis/lib/Assembler.cpp b/llvm/tools/llvm-exegesis/lib/Assembler.cpp
index fd7924db08441..a73eaf76a46d7 100644
--- a/llvm/tools/llvm-exegesis/lib/Assembler.cpp
+++ b/llvm/tools/llvm-exegesis/lib/Assembler.cpp
@@ -66,6 +66,8 @@ static bool generateSnippetSetupCode(const ExegesisTarget &ET,
assert(MM.Address % getpagesize() == 0 &&
"Memory mappings need to be aligned to page boundaries.");
#endif
+ // FIXME: file descriptor for aux memory seems not initialized.
+ // TODO: Invoke openat syscall to get correct fd for aux memory
const MemoryValue &MemVal = Key.MemoryValues.at(MM.MemoryValueName);
BBF.addInstructions(ET.generateMmap(
MM.Address, MemVal.SizeBytes,
@@ -78,15 +80,47 @@ static bool generateSnippetSetupCode(const ExegesisTarget &ET,
Register StackPointerRegister = BBF.MF.getSubtarget()
.getTargetLowering()
->getStackPointerRegisterToSaveRestore();
+#define DEBUG_TYPE "register-initial-values"
+ // FIXME: Only loading first register with memory address is hacky.
+ bool isFirstRegister = true;
for (const RegisterValue &RV : Key.RegisterInitialValues) {
+ // Debug: register name and class name and value from BenchmarkKey
+ const MCRegisterInfo *RegInfo = BBF.MF.getTarget().getMCRegisterInfo();
+ const char *RegName = RegInfo->getName(RV.Register);
+ const char *regClassName = "Unknown";
+ for (unsigned i = 0, e = RegInfo->getNumRegClasses(); i < e; ++i) {
+ const MCRegisterClass &RC = RegInfo->getRegClass(i);
+ if (RC.contains(RV.Register)) {
+ regClassName = RegInfo->getRegClassName(&RC);
+ break;
+ }
+ }
+ LLVM_DEBUG(
+ dbgs() << "Setting register (Class: " << regClassName << ") " << RegName
+ << std::string(
+ std::max(0, 3 - static_cast<int>(strlen(RegName))), ' '));
+
if (GenerateMemoryInstructions) {
// If we're generating memory instructions, don't load in the value for
// the register with the stack pointer as it will be used later to finish
// the setup.
if (Register(RV.Register) == StackPointerRegister)
continue;
+#if defined(__aarch64__)
+ auto StackLoadInsts = ET._generateRegisterStackPop(RV.Register, 16);
+ if (!StackLoadInsts.empty() && isFirstRegister) {
+ for (const auto &Inst : StackLoadInsts)
+ BBF.addInstruction(Inst);
+ isFirstRegister = false;
+ LLVM_DEBUG(dbgs() << "from stack with post-increment offset of " << 16
+ << " bytes\n");
+ continue;
+ }
+#endif
}
// Load a constant in the register.
+ LLVM_DEBUG(dbgs() << " to " << RV.Value << "\n");
+#undef DEBUG_TYPE
const auto SetRegisterCode = ET.setRegTo(*MSI, RV.Register, RV.Value);
if (SetRegisterCode.empty())
IsSnippetSetupComplete = false;
diff --git a/llvm/tools/llvm-exegesis/lib/Target.h b/llvm/tools/llvm-exegesis/lib/Target.h
index 77fbaa6e95412..736c9d9ff6c23 100644
--- a/llvm/tools/llvm-exegesis/lib/Target.h
+++ b/llvm/tools/llvm-exegesis/lib/Target.h
@@ -308,6 +308,10 @@ class ExegesisTarget {
return std::make_unique<SavedState>();
}
+ virtual st...
[truncated]
|
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add test-cases please?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
An assert with a message is probably better than silently returning something.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added warning for unimplemented setStackRegisterToAuxMem .
Assembler.cpp calls this function. X86 has implemented this, previously similarly i had implemented but didn't understood reasoning behind moving stack pointer to AuxMem address.
Thus, reverted those changes to rather include a warning about unimplemented setStackRegisterToAuxMem
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is the first argument to the mmap syscall isn't it? If the address is 0, i.e. not specified, then MMAP returns an address? This could be clarified. But either way, why do we only use the fixed addresses? And is there any value in having this boolean here? Do we need it, or is it clearer to just get rid of it if we only use the fixed addresses anyway?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
mmap always returns an address. When you pass an address, there is no guarantee that will actually be the address of the mapping. The actual address of the mapping (or an error code) gets returned.
Exegesis currently only has support for mappings at a fixed address.
Given this flag is always set to true, we should just get rid of it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Given fixed address flag is always set to true, we should just get rid of it.
Removed fixed flag and its code-flow. Code was previously, added to test guaranteed mmap (ensure by kernel but not to specific location known beforehand).
Exegesis currently only has support for mappings at a fixed address.
As, mention by in above comment too .
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's unclear to me what the consequences are of this? Does not work at all? Does it mean all measurements are unreliable? Please clarify.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is unclear to me where to get file descriptor's address and using fd=-1 (as used when mmap called for aux memory) returns EBADF error in X8 register after syscall.
Function configurePerfCounter should set up performance monitoring perf_event_open.
To not include subprocess's setup code in benchmarking the instruction.
Yes, Currently all measurements are unreliable in subprocess mode for AArch64.
PS: Added warning for unimplemented configurePerfCounter and resultant unreliable measurements
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The performance counters do not work at all if these syscalls do not work. This probably needs to be fixed before this lands.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added implementation for configurePerfCounter()that load actual file descriptor from auxiliary memory location [address + 0]. The performance counters are now reset after the subprocess memory mode's setup code to remove the overhead and give reliable measurements.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
mmap always returns an address. When you pass an address, there is no guarantee that will actually be the address of the mapping. The actual address of the mapping (or an error code) gets returned.
Exegesis currently only has support for mappings at a fixed address.
Given this flag is always set to true, we should just get rid of it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why would this need openat instead of open?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Reverted the comment, Added required debug warning generateMmapAuxMem().
- Switched X16 as temporary register in loadFPCRImmediate instead of X8 which is used by syscalls - Updated Testcase with hardcoded reg number.
…re syscall registers and syscall generator
… for subprocess execution mode
…ty and update syscall argument handling and descriptive comments
28c44e0 to
4ab9412
Compare
… AArch64 and X86 targets
…mance counter config
|
Thanks for adding all of this. I am going to take a look now, and in the meantime wanted to leave these high level questions:
|
No, Also, Verified mmap functions implementation are working i.e. auxiliary mmap and manual snippet mmap both works so thats a win atleast.
Yes, we can debug the generated assembly by
Testcases for added functionalities, if i am missing anything below please tell
this be enough to test PR changes, right? [For completeness]
|
|
With how much higher the measurements are in subprocess mode, it seems like configuring the perf counter isn't actually doing anything. I don't believe there is any error handling there, so it could be that the system calls are just failing. I've been meaning to wire up error handling for that and mmap failures but have never gotten around to it. The best way I found to validate it was to single step through the assembly in a debugger. There's some flag that I think I remember putting somewhere to not have |
…k for AArch64 for execution with perf
…p asm in configurePerfCounter
…mapping and performance counters
…nfigurePerfCounter
|
Yes, ioctl syscall is failing due to file descriptor not present at expected auxiliary memory address. // llvm/tools/llvm-exegesis/lib/Assembler.cpp:105
// └─llvm/tools/llvm-exegesis/lib/AArch64/Target.cpp:482
ioctl(
fd = [getAuxiliaryMemoryStartAddress()] = [VAddressSpaceCeiling - 2 * pagesize],
Request = PERF_EVENT_IOC_RESET,
arg = PERF_IOC_FLAG_GROUP
)Similarly, // llvm/tools/llvm-exegesis/lib/Assembler.cpp:
const MemoryValue &MemVal = Key.MemoryValues.at(MM.MemoryValueName);
BBF.addInstructions(ET.generateMmap(
MM.Address, MemVal.SizeBytes,
ET.getAuxiliaryMemoryStartAddress() +
sizeof(int) *
(MemVal.Index + SubprocessMemory::AuxiliaryMemoryOffset))); |
|
Current state of PR will enable the following :
Limitations:
And, for reference output of LD1B in subprocess mode for benchmarking latency:- $ build/bin/llvm-exegesis -mode=latency --execution-mode=subprocess --opcode-name=LD1B --debug-only="preview-gen-assembly"
Warning: generateMmapAuxMem using anonymous mapping
Warning: setStackRegisterToAuxMem called but not required for AArch64
Warning: configurePerfCounter ioctl syscall failing
Warning: configurePerfCounter ioctl syscall failing
Warning: generateMmapAuxMem using anonymous mapping
Warning: setStackRegisterToAuxMem called but not required for AArch64
Warning: configurePerfCounter ioctl syscall failing
Warning: configurePerfCounter ioctl syscall failing
Generated assembly snippet:
'''
0: fc1f0fed str d13, [sp, #-16]!
4: f90007f7 str x23, [sp, #8]
8: b26f77e0 mov x0, #140737488224256
c: d2820001 mov x1, #4096
10: d2800062 mov x2, #3
14: d2800423 mov x3, #33
18: f2a00203 movk x3, #16, lsl #16
1c: 92800004 mov x4, #-1
20: d2800005 mov x5, #0
24: d2801bc8 mov x8, #222
28: d4000001 svc #0
2c: f81f0fe0 str x0, [sp, #-16]!
30: 2518e3e6 ptrue p6.b
34: f84107ef ldr x15, [sp], #16
38: d2800017 mov x23, #0
3c: 2518e3e0 ptrue p0.b
40: 25f8c00d mov z13.d, #0
44: f81f0fe8 str x8, [sp, #-16]!
48: f81f0fe0 str x0, [sp, #-16]!
4c: f81f0fe1 str x1, [sp, #-16]!
50: f81f0fe2 str x2, [sp, #-16]!
54: b26f77f0 mov x16, #140737488224256
58: b9400200 ldr w0, [x16]
5c: d2848061 mov x1, #9219
60: d2800022 mov x2, #1
64: d28003a8 mov x8, #29
68: d4000001 svc #0
6c: f84107e2 ldr x2, [sp], #16
70: f84107e1 ldr x1, [sp], #16
74: f84107e0 ldr x0, [sp], #16
78: f84107e8 ldr x8, [sp], #16
7c: a41759f0 ld1b { z16.b }, p6/z, [x15, x23]
80: 244d0207 cmphs p7.h, p0/z, z16.h, z13.h
84: a41759f0 ld1b { z16.b }, p6/z, [x15, x23]
88: 244d0207 cmphs p7.h, p0/z, z16.h, z13.h
... (9994 more instructions)
9cb4: a41759f0 ld1b { z16.b }, p6/z, [x15, x23]
9cb8: 244d0207 cmphs p7.h, p0/z, z16.h, z13.h
9cbc: b26f77f0 mov x16, #140737488224256
9cc0: b9400200 ldr w0, [x16]
9cc4: d2848021 mov x1, #9217
9cc8: d2800022 mov x2, #1
9ccc: d28003a8 mov x8, #29
9cd0: d4000001 svc #0
9cd4: d2800000 mov x0, #0
9cd8: d2800ba8 mov x8, #93
9cdc: d4000001 svc #0
9ce0: f94007f7 ldr x23, [sp, #8]
9ce4: fc4107ed ldr d13, [sp], #16
9ce8: d65f03c0 ret
'''
---
mode: latency
key:
instructions:
- 'LD1B Z16 P6 X15 X23'
- 'CMPHS_PPzZZ_H P7 P0 Z16 Z13'
config: ''
register_initial_values:
- 'P6=0x0'
- 'X15=0x0'
- 'X23=0x0'
- 'P0=0x0'
- 'Z13=0x0'
cpu_name: neoverse-v2
llvm_triple: aarch64-unknown-linux-gnu
min_instructions: 10000
measurements:
- { key: latency, value: 7.2836, per_snippet_value: 14.5672, validation_counters: {} }
error: ''
info: Repeating two instructions
assembled_snippet: ED0F1FFCF70700F9E0776FB2010082D2620080D2230480D20302A0F204008092050080D2C81B80D2010000D4E00F1FF8E6E31825EF0741F8170080D2E0E318250DC0F825E80F1FF8E00F1FF8E10F1FF8E20F1FF8F0776FB2000240B9618084D2220080D2A80380D2010000D4E20741F8E10741F8E00741F8E80741F8F05917A407024D24F05917A407024D24F05917A407024D24F05917A407024D24F0776FB2000240B9218084D2220080D2A80380D2010000D4000080D2A80B80D2010000D4F70740F9ED0741FCC0035FD6
... |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unreliable measurements due to ioctl syscall failing, file descriptor for perf event is not populated at auxiliary memory location where it is expected. (Inclined to take up this in future)
(FYI) All Memory mapping is anonymous, not a functional problem. (Inclined to take up this in future)
Both of these need to be fixed before this PR can land. It doesn't make sense to support subprocess execution mode in exegesis when getting a cycles count does not work at all. Keeping all memory mappings anonymous also breaks the semantics of how the memory is supposed to be managed.
|
Yes, Definitely would want support with reliable measurements and correct memory management. But i am stuck here ioctl and mmap requires the fd which are supposed to be at specific offset to auxiliary memory starting address as mentioned in penultimate comment. Thus, Can you please shed some light on usage of
PS: I had a bit hunch on this problem coming from using stack to save return value of aux mmap for loading first GPR register with memory address. Thus,saved same in temp register to be loaded for req. register, But this also have same measurement mismatch in subprocess and inprocess and LD1B for different |
We map two pages in at the end of the user-mode virtual address space for the auxiliary memory. I think we use the first page for the actual data we need, and the second page as a stack for setting up vector registers.
|
We want to support load and store instructions for aarch64 but currently they throw segmentation fault as register are initialized by 0 but load instruction requires register to be loaded with valid memory address.
Load registers requiring memory address
Prerequisite for this is to support --mode=subprocess for AArch64.
Adding support for --mode=subprocess and Memory Annotation of manual snippet.
Adding memory setup, required by subprocess for AArch64.
Implement Auxiliary memory mmap, manual snippet mmap and configurePerfCounter().
Functions for syscall generations.
Generating syscall for aux mmap, save its return value in stack and load required registers with memory address.
Implemented the same, currently.
TODO: how to differentiate between register requiring address and otherwise ?
For example: LD1B opcode (ld1b {z6.b}, p4/z, [x14, x2]) expects first register to contain address and second having offset value.
Current fix is Init first register queried by instruction to loaded by address and rest by setRegTo as done previously.
Enable load instructions in --mode=subprocess
ConfigurePerfCounter(): Reset perf counter to not include subprocess overhead in measurementsPlease review: @sjoerdmeijer, @boomanaiden154, @davemgreen
This is minimal patch required
Thanks,